################################################################################
# 02_clean_ipeds_data.R
# Clean IPEDS Enrollment and Graduation Data
#
# Inputs:
#   - IPEDS API (via educationdata package) or
#   - data/raw/ipeds/ folder with pre-downloaded CSV files
#
# Outputs:
#   - data/cleaned/ipeds_data_cleaned.xlsx
################################################################################

library(data.table)
library(readxl)
library(writexl)
library(tidyverse)
library(purrr)

# Check if educationdata package is available
use_api <- requireNamespace("educationdata", quietly = TRUE)

if (use_api) {
  library(educationdata)
  cat("  Using IPEDS API to download data...\n")
} else {
  cat("  Note: educationdata package not installed. Using pre-downloaded data.\n")
  cat("  To install: install.packages('educationdata')\n")
}

# ──────────────────────────────────────────────────────────────────────────────
# Option 1: Load Pre-Cleaned Data (faster for replication)
# ──────────────────────────────────────────────────────────────────────────────

if (file.exists("data/cleaned/ipeds_data_cleaned.xlsx")) {
  cat("  Found existing cleaned IPEDS data. Skipping download.\n")
  cat("  Delete data/cleaned/ipeds_data_cleaned.xlsx to re-download.\n")
} else if (!use_api) {
  stop("IPEDS data not found at 'data/cleaned/ipeds_data_cleaned.xlsx' and the 'educationdata' R package is not installed.\n",
       "Either run 02_clean_ipeds_data.R with the educationdata package installed, or provide the pre-cleaned file.")

} else {
  # ──────────────────────────────────────────────────────────────────────────────
  # Option 2: Download from IPEDS API
  # ──────────────────────────────────────────────────────────────────────────────

  cat("  Downloading IPEDS data (this may take several minutes)...\n")

  topics <- c("directory", "completions-cip-2", "fall-enrollment",
              "institutional-characteristics", "admissions-requirements",
              "admissions-enrollment", "grad-rates", "student-faculty-ratio",
              "sfa-all-undergraduates", "fall-retention")

  # Download each topic
  for (topic in topics) {
    cat("    Downloading:", topic, "\n")

    subtopic <- if (topic == 'fall-enrollment') list('race', 'sex') else NULL

    tryCatch({
      data <- get_education_data(
        level = 'college-university',
        source = 'ipeds',
        topic = topic,
        subtopic = subtopic,
        csv = TRUE,
        add_labels = TRUE
      )
      var_name <- paste0("data_", gsub("-", "_", topic))
      assign(var_name, data)
    }, error = function(e) {
      cat("      Error downloading", topic, "\n")
    })
  }

  # ──────────────────────────────────────────────────────────────────────────────
  # Process Completions Data (CIP 2-digit - Education)
  # ──────────────────────────────────────────────────────────────────────────────

  setDT(data_completions_cip_2)

  # Filter to education graduates
  filtered_data <- data_completions_cip_2 %>%
    filter(cipcode == "Education",
           award_level %in% c("Bachelor's degree", "Master's degree"),
           year >= 2000,
           race %in% c("White", "Black", "Hispanic", "Total"),
           sex %in% c("Male", "Female", "Total"))

  # Summarize by university-year
  data_completions_cip2 <- filtered_data %>%
    group_by(unitid, year, award_level, sex, race) %>%
    summarise(total_completions = sum(awards, na.rm = TRUE), .groups = "drop") %>%
    group_by(unitid, year) %>%
    summarise(
      ba_educationcip2_total_completions = sum(total_completions[sex == "Total" & race == "Total" & award_level == "Bachelor's degree"], na.rm = TRUE),
      ba_educationcip2_male_completions = sum(total_completions[sex == "Male" & race == "Total" & award_level == "Bachelor's degree"], na.rm = TRUE),
      ba_educationcip2_female_completions = sum(total_completions[sex == "Female" & race == "Total" & award_level == "Bachelor's degree"], na.rm = TRUE),
      ma_educationcip2_total_completions = sum(total_completions[sex == "Total" & race == "Total" & award_level == "Master's degree"], na.rm = TRUE),
      .groups = "drop"
    )

  # ──────────────────────────────────────────────────────────────────────────────
  # Process CIP 6-digit Completions (Teacher Preparation Programs)
  # ──────────────────────────────────────────────────────────────────────────────

  # Define teacher preparation CIP codes
  cip_codes <- c("130101",
                 sprintf("1302%02d", 1:99),
                 "130301",
                 sprintf("1310%02d", 1:99),
                 sprintf("1312%02d", 1:99),
                 sprintf("1313%02d", 1:99),
                 sprintf("1314%02d", 1:99),
                 "139999")

  years <- 2008:2020
  all_cip6_data <- list()

  for (year in years) {
    cat("    Downloading CIP-6 completions for", year, "\n")
    tryCatch({
      cip6_completions <- get_education_data(
        level = "college-university",
        source = "ipeds",
        topic = "completions-cip-6",
        filters = list(year = year),
        csv = TRUE,
        add_labels = TRUE
      )

      filtered <- cip6_completions %>%
        filter(cipcode_6digit %in% cip_codes,
               award_level %in% c("Bachelor's degree", "Master's degree"),
               sex %in% c("Male", "Female", "Total"),
               race %in% c("White", "Black", "Hispanic", "Total"),
               majornum %in% c("First major", "Second major")) %>%
        group_by(unitid, year) %>%
        summarise(
          ba_teacher_preparation_completions_total = sum(awards_6digit[sex == "Total" & race == "Total" & award_level == "Bachelor's degree"], na.rm = TRUE),
          ma_teacher_preparation_completions_total = sum(awards_6digit[sex == "Total" & race == "Total" & award_level == "Master's degree"], na.rm = TRUE),
          ba_masters_teacher_preparation_completions_total = sum(awards_6digit[sex == "Total" & race == "Total"], na.rm = TRUE),
          .groups = "drop"
        )

      all_cip6_data[[as.character(year)]] <- filtered
    }, error = function(e) {
      cat("      Error for year", year, "\n")
    })
  }

  data_completions_cip_6 <- bind_rows(all_cip6_data)

  # ──────────────────────────────────────────────────────────────────────────────
  # Process Other IPEDS Tables
  # ──────────────────────────────────────────────────────────────────────────────

  # Institutional characteristics
  data_institutional_characteristics <- data_institutional_characteristics %>%
    select(unitid, year, inst_affiliation, primary_public_control, bach_offered,
           masters_offered, enrolled_undergrad_fulltime, teacher_cert)

  # Admissions requirements
  data_admissions_requirements <- data_admissions_requirements %>%
    select(unitid, year, sat_crit_read_25_pctl, sat_crit_read_75_pctl,
           sat_math_25_pctl, sat_math_75_pctl, act_composite_25_pctl,
           act_composite_75_pctl, sat_crit_read_50_pctl, sat_math_50_pctl) %>%
    filter(year >= 2008 & year <= 2020)

  # Student financial aid (Pell grants)
  data_sfa_all_undergraduates <- data_sfa_all_undergraduates %>%
    filter(level_of_study == "Undergraduate",
           type_of_aid == "Pell grant",
           ftpt == "Total") %>%
    rename(pell_percent = percent_of_students,
           pell_average_amount = average_amount) %>%
    select(unitid, year, pell_percent, pell_average_amount)

  # Directory info
  data_directory <- data_directory %>%
    select(unitid, year, inst_name, state_abbr, fips, hbcu, inst_control)

  # ──────────────────────────────────────────────────────────────────────────────
  # Merge All IPEDS Data
  # ──────────────────────────────────────────────────────────────────────────────

  data_ipeds <- data_completions_cip_6 %>%
    left_join(data_completions_cip2, by = c("unitid", "year")) %>%
    left_join(data_institutional_characteristics, by = c("unitid", "year")) %>%
    left_join(data_admissions_requirements, by = c("unitid", "year")) %>%
    left_join(data_directory, by = c("unitid", "year")) %>%
    left_join(data_sfa_all_undergraduates, by = c("unitid", "year"))

  # ──────────────────────────────────────────────────────────────────────────────
  # Save Output
  # ──────────────────────────────────────────────────────────────────────────────

  write_xlsx(data_ipeds, "data/cleaned/ipeds_data_cleaned.xlsx")

  cat("  Created: data/cleaned/ipeds_data_cleaned.xlsx\n")
  cat("  Universities:", n_distinct(data_ipeds$unitid), "\n")
  cat("  Years:", min(data_ipeds$year), "-", max(data_ipeds$year), "\n")

  # Clean up
  rm(list = ls(pattern = "^data_"))
  gc()
}

cat("  IPEDS data processing complete.\n")
